{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bb4f80ae-d4bf-48f0-802b-a932de6c9759",
   "metadata": {},
   "source": [
    "# Instruction difficulty\n",
    "\n",
    "This is the code to compute the instruction difficulty term from the length-controlled GLM. Note that the instruction difficulty term is not a crucial feature in the GLM; in particular, the properties of the GLM hold regardless of how the instruction difficulty feature is computed (even if random). Having a sensible instruction difficulty slightly increases the correlation with LMSYS. More details in this [GH issue](https://github.com/tatsu-lab/alpaca_eval/issues/346)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "77e923ce-1f87-4c6c-87f3-70c0bca767b8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/yanndubois/Desktop/GitHub/alpaca_eval\n"
     ]
    }
   ],
   "source": [
    "%cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "406fc95c-073a-4952-818b-712ceac94c8f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from dotenv import load_dotenv\n",
    "load_dotenv(\".env\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "cc4760f6-fd06-4d2c-bced-f0bc0f63c4e5",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/yanndubois/opt/anaconda3/envs/alpaca_eval/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "%matplotlib inline\n",
    "\n",
    "from notebooks.notebook_helpers import *\n",
    "from alpaca_eval.metrics.glm_winrate import make_dmatrix_for_model, fit_LogisticRegressionCV\n",
    "sklearn.set_config(enable_metadata_routing=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "953043d0-d527-4717-bd89-0971b1d39211",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dropped 4 models\n",
      "We are comparing to 37 Arena models\n"
     ]
    }
   ],
   "source": [
    "# Load the AlpacaEval 2 leaderboard CSV, using its first column as the index.\n",
    "lb = pd.read_csv(\n",
    "    \"src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv\",\n",
    "    index_col=0,\n",
    ")\n",
    "\n",
    "# Drop every leaderboard entry whose annotations file is missing on disk.\n",
    "indices_to_drop = [\n",
    "    name\n",
    "    for name in lb.index\n",
    "    if not Path(f\"results/{name}/weighted_alpaca_eval_gpt4_turbo/annotations.json\").is_file()\n",
    "]\n",
    "lb = lb.drop(indices_to_drop)\n",
    "print(f\"dropped {len(indices_to_drop)} models\")\n",
    "\n",
    "lb_arena = make_lb_arena(lb)\n",
    "print(f\"We are comparing to {len(lb_arena)} Arena models\")\n",
    "\n",
    "process_gamed_models_(lb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "fcd849ca-9454-46d1-9504-becc9aa05a30",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline first, followed by every other leaderboard model in leaderboard order.\n",
    "all_models = list(lb.index)\n",
    "ordered_models = [BASELINE, *(m for m in all_models if m != BASELINE)]\n",
    "\n",
    "# Load the annotations and keep only the rows where `len_2` is non-zero.\n",
    "all_df_annotations = load_annotations(lb).query(\"len_2 != 0\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "667d94a4-2f48-4587-9ce7-c57abb0bb527",
   "metadata": {},
   "source": [
    "## Joint fitting to compute instruction_difficulty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "1c633004-a7f4-4d58-b75b-3f4d26e3fc2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "from patsy import build_design_matrices, dmatrix\n",
    "# Same function as in glm_winrate.py, redefined here so that formulas can use notebook-local variables.\n",
    "def make_dmatrix_for_model(df_train, df_test, formula, col_y_true=\"preference\"):\n",
    "    \"\"\"Build the train and test design matrices for `formula`.\n",
    "\n",
    "    The test matrix reuses the `design_info` of the train matrix. The `col_y_true` column of\n",
    "    `df_train` is then added to the train frame as the label; the test frame gets no label.\n",
    "\n",
    "    Returns the tuple `(train frame with label, test frame)`.\n",
    "    \"\"\"\n",
    "    train_design = dmatrix(formula, df_train, return_type=\"dataframe\")\n",
    "    test_design = build_design_matrices(\n",
    "        [train_design.design_info], df_test, return_type=\"dataframe\"\n",
    "    )[0]\n",
    "    train_design[col_y_true] = df_train[col_y_true]  # label column\n",
    "    return train_design, test_design"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "94523de9-629f-4f6b-9a16-74920f084d78",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_index_param_from_joint_optimization(df, df_lb, formula):\n",
    "    \"\"\"Jointly fit the GLM described by `formula` and extract the per-instruction `C(index)` weights.\n",
    "\n",
    "    Side effect: reads the notebook-level `lb` and writes the predicted win rate (in %) to `lb[formula]`.\n",
    "\n",
    "    Args:\n",
    "        df: training rows; must contain the `preference` and `index` columns.\n",
    "        df_lb: rows to predict on; must contain `generator_2`.\n",
    "        formula: patsy formula that includes a `C(index)` term.\n",
    "\n",
    "    Returns:\n",
    "        idx_params: frame with a single `param` column, indexed by instruction index and sorted.\n",
    "        model: the model returned by `fit_LogisticRegressionCV`.\n",
    "        metrics: the value returned by `report(lb, formula, is_return_metrics=True)`.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: if the dummy encoding did not drop exactly one instruction index.\n",
    "    \"\"\"\n",
    "    curr_df_lb = df_lb.copy()\n",
    "    df_input, df_input_lb = make_dmatrix_for_model(df, df_lb, formula=formula)\n",
    "    df_input[\"preference\"] = df[\"preference\"]\n",
    "\n",
    "    model = fit_LogisticRegressionCV(df_input, \"preference\", is_ytrue_proba=True, n_splits=5)\n",
    "    curr_df_lb[\"preference\"] = model.predict_proba(df_input_lb)[:, 1]\n",
    "    lb[formula] = curr_df_lb.groupby(\"generator_2\")[\"preference\"].mean()[lb.index] * 100\n",
    "    metrics = report(lb, formula, is_return_metrics=True)\n",
    "\n",
    "    params = pd.Series(index=model.feature_names_in_, data=model.coef_[0]).to_frame()\n",
    "    idx_params = params.loc[[i for i in params.index if \"C(index)\" in i]]\n",
    "    # feature names look like `C(index)[T.<idx>]`; keep only the integer <idx>\n",
    "    idx_params.index = [int(i.split(\"[T.\")[-1].split(\"]\")[0]) for i in idx_params.index]\n",
    "\n",
    "    # Dummy encoding drops one index level, which effectively has weight 0: add it back explicitly.\n",
    "    missing_idcs = set(df[\"index\"].unique()) - set(idx_params.index)\n",
    "    assert len(missing_idcs) == 1, f\"expected exactly one dropped index level, got {sorted(missing_idcs)}\"\n",
    "    missing_idx = missing_idcs.pop()\n",
    "    # use the level that is actually missing instead of assuming that it is 0\n",
    "    idx_params.loc[missing_idx, :] = 0\n",
    "    idx_params = idx_params.sort_index()\n",
    "    idx_params.columns = [\"param\"]\n",
    "    return idx_params, model, metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36f6d103-c620-4074-a2d1-f1c2e2ef6429",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Terms of the joint formula:\n",
    "# - C(generator_2, levels=ordered_models): np.tanh(rand_delta_len_std_only) means that each model has its own weight for np.tanh(rand_delta_len_std_only)\n",
    "# - C(generator_2, levels=ordered_models) means that each model has its own bias term\n",
    "# - C(index) means that each example (instruction) has its own weight. This is what becomes the instruction difficulty\n",
    "joint_formula = \"C(generator_2, levels=ordered_models): np.tanh(rand_delta_len_std_only) + C(generator_2, levels=ordered_models) + C(index)  + not_gamed_baseline.astype(float) - 1\"\n",
    "\n",
    "# init data without instruction difficulty\n",
    "df_init, df_lb_init = make_data(all_df_annotations)\n",
    "\n",
    "# joint optimization to get the instruction difficulty\n",
    "# NOTE: get_index_param_from_joint_optimization only takes (df, df_lb, formula); passing `is_print` raised a TypeError.\n",
    "instruction_difficulty, model, metrics = get_index_param_from_joint_optimization(df_init, df_lb_init, formula=joint_formula)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7576712a-fd7a-4e44-909d-f5c4d2468dfd",
   "metadata": {},
   "source": [
    "## Validate instruction_difficulty"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a3208d71-e283-4bfc-927f-ffd24e65a966",
   "metadata": {},
   "source": [
    "Now let's run the optimization with the newly computed and fixed instruction difficulty. I.e. fit all parameters disjointly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "d17aed31-78b3-4df0-b7f5-e30a82a1bce8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/yanndubois/opt/anaconda3/envs/alpaca_eval/lib/python3.11/site-packages/scipy/stats/_stats_py.py:4424: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined.\n",
      "  warnings.warn(stats.ConstantInputWarning(msg))\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'verbosity_gameability': 6.24064558743236,\n",
       " 'conciseness_gameability': 14.433044187827408,\n",
       " 'adversarial_rank_gain': 49.0,\n",
       " 'adversarial_winrate_gain': 9.620112879799999,\n",
       " 'corr_arena': 0.9745109730605707,\n",
       " 'corr_len': 0.2513188315718244,\n",
       " 'logloss': 0.2672176991151537,\n",
       " 'mse': 0.049212584861330455,\n",
       " 'r2': 0.42343866160090404,\n",
       " 'corr': 0.6400694636182821,\n",
       " 'acc': 0.8967795936782501}"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# formula that uses the precomputed instruction_difficulty as a feature\n",
    "formula = \"np.tanh(rand_delta_len_std_only) + instruction_difficulty + not_gamed_baseline.astype(float) - 1\"\n",
    "\n",
    "# build the data with the jointly fitted instruction difficulty\n",
    "df, df_lb = make_data(all_df_annotations, instruction_difficulty=instruction_difficulty)\n",
    "\n",
    "# run the disjoint optimization; this is the only step that actually has to be run\n",
    "metrics, models = disjoint_optimization_(\n",
    "    lb,\n",
    "    df,\n",
    "    df_lb,\n",
    "    formula=formula,\n",
    "    regularize_to_baseline_lambda=0.2,\n",
    ")\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "2cd999f2-cd1f-40a2-bd64-bb8adc3f738b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Squeeze to a Series named `instruction_difficulty`, sorted by its index, with the index named `index`.\n",
    "instruction_difficulty = (\n",
    "    instruction_difficulty.squeeze()\n",
    "    .sort_index()\n",
    "    .rename(\"instruction_difficulty\")\n",
    "    .rename_axis(\"index\")\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b89dccf-3f2d-4540-954e-1043ce8ba933",
   "metadata": {},
   "source": [
    "Now let's compare the results with the instruction_difficulty computed on fewer models in early 2024, i.e., the instruction_difficulty that we actually use in AlpacaEval LC."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "02776f33-509a-4eab-8a48-3c267edde467",
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import hf_hub_download\n",
    "# Fetch instruction_difficulty.csv from the tatsu-lab/alpaca_eval dataset repo (force_download=True re-downloads it on every run).\n",
    "out = hf_hub_download(\n",
    "    repo_id=\"tatsu-lab/alpaca_eval\",\n",
    "    filename=\"instruction_difficulty.csv\",\n",
    "    repo_type=\"dataset\",\n",
    "    force_download=True,\n",
    "    cache_dir=constants.DEFAULT_CACHE_DIR,\n",
    ")\n",
    "\n",
    "# the first CSV column is the index; squeeze() collapses the frame to a Series when a single data column remains\n",
    "hf_instruction_difficulty = pd.read_csv(out, index_col=0).squeeze()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "cddae7c7-8bc6-4db2-bd33-bdfa378902d5",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/yanndubois/opt/anaconda3/envs/alpaca_eval/lib/python3.11/site-packages/scipy/stats/_stats_py.py:4424: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined.\n",
      "  warnings.warn(stats.ConstantInputWarning(msg))\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "{'verbosity_gameability': 7.454594530315204,\n",
       " 'conciseness_gameability': 15.959179311867668,\n",
       " 'adversarial_rank_gain': 49.0,\n",
       " 'adversarial_winrate_gain': 8.450426719471494,\n",
       " 'corr_arena': 0.9746295267069284,\n",
       " 'corr_len': 0.2731314094063601,\n",
       " 'logloss': 0.2719086188374834,\n",
       " 'mse': 0.051460503983874965,\n",
       " 'r2': 0.41395421815105193,\n",
       " 'corr': 0.6329363823423694,\n",
       " 'acc': 0.891627534425641}"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# rebuild the data, this time with the instruction difficulty downloaded from the Hugging Face hub\n",
    "df, df_lb = make_data(all_df_annotations, instruction_difficulty=hf_instruction_difficulty)\n",
    "\n",
    "# rerun the disjoint optimization, reusing the `formula` variable defined in an earlier cell\n",
    "metrics, models = disjoint_optimization_(\n",
    "    lb,\n",
    "    df,\n",
    "    df_lb,\n",
    "    formula=formula,\n",
    "    regularize_to_baseline_lambda=0.2,\n",
    ")\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "27914efb-d890-41f1-8020-f1454c46cbca",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Spearman: 0.953\n",
      "Pearson: 0.958\n"
     ]
    }
   ],
   "source": [
    "from scipy.stats import pearsonr, spearmanr\n",
    "\n",
    "# scipy pairs the values by position, not by index label, so first align both Series on their shared index.\n",
    "ours, reference = instruction_difficulty.align(hf_instruction_difficulty, join=\"inner\")\n",
    "assert len(ours) > 0, \"the two instruction difficulties share no index\"\n",
    "\n",
    "s = spearmanr(ours, reference).statistic\n",
    "r = pearsonr(ours, reference).statistic\n",
    "print(f\"Spearman: {s:.3f}\")\n",
    "print(f\"Pearson: {r:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0e29f843-9117-407f-9099-4044037782ae",
   "metadata": {},
   "source": [
    "We see that, even though the instruction_difficulty is computed on a different set of models, the results with both sets of instruction_difficulty are very similar and both sets of instruction_difficulty are highly correlated."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8835d65c-9775-4f5d-a938-989dd0739c44",
   "metadata": {},
   "source": [
    "We see that despite being computed on a different set of models, the instruction difficulties are highly correlated."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4d307cd7-262d-4ce5-8f54-ca521ae7fc90",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
